TL;DR

Given the results of the analysis of the Fluidigm data, here I explore the role of the V intercept in a more complex data (Allen).

We want to check if the intercept acts as a size factor here too and, if true, if W capture some interesting biology.

Interestingly, in this more complex dataset, both intercepts are needed to get a clear picture of the data in two dimensions.

Data

As a first pass, I will only focus on the cells that passed the QC filters (by the original authors) and that were part of the “core” clusters. I will color-code the cells by either known cell type or by inferred cluster (inferred in the original study).

Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells.

data("allen")
allen_core <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
                    which(colData(allen)$Core.Type=="Core")]

filter <- apply(assay(allen_core)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 11545

To speed up the computations, I will focus on the top 1,000 most variable genes.

allen_core <- allen_core[filter,]
core <- assay(allen_core)

vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]

First, let’s look at PCA (of the log counts) for reference.

par(mfrow=c(1, 2))
bio <- as.factor(colData(allen_core)$driver_1_s)
cl <- as.factor(colData(allen_core)$Primary.Type)

detection_rate <- colSums(core>0)
coverage <- colSums(core)

pca <- prcomp(t(log1p(core)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts, centered not scaled")
legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts, centered not scaled")

df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                        PC1         PC2 detection_rate   coverage
## PC1             1.00000000 -0.01052424      0.8066812  0.4453172
## PC2            -0.01052424  1.00000000     -0.3601716 -0.3286485
## detection_rate  0.80668124 -0.36017158      1.0000000  0.5275845
## coverage        0.44531717 -0.32864852      0.5275845  1.0000000

V intercept in both Mu and Pi

print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2)))
##    user  system elapsed 
## 106.854  30.021  64.516

Plot the results with cells colored according to their biological condition.

par(mfrow=c(1, 2))
plot(zinb_Vall@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_Vall@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("topright", levels(cl), fill=cols2)

Explore Gamma estimates

One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.

#total number of detected genes in the cell
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2   gamma_mu   gamma_pi
## W1              1.00000000  0.03514491 -0.0843733  0.3598978
## W2              0.03514491  1.00000000 -0.3676352  0.3710653
## gamma_mu       -0.08437330 -0.36763518  1.0000000 -0.3348686
## gamma_pi        0.35989777  0.37106533 -0.3348686  1.0000000
## detection_rate -0.35188976 -0.34026613  0.3712522 -0.9939701
## coverage       -0.05250198 -0.05649406  0.8504471 -0.4846195
##                detection_rate    coverage
## W1                 -0.3518898 -0.05250198
## W2                 -0.3402661 -0.05649406
## gamma_mu            0.3712522  0.85044711
## gamma_pi           -0.9939701 -0.48461953
## detection_rate      1.0000000  0.52758451
## coverage            0.5275845  1.00000000

\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.

par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vall)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vall))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vall@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vall@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vall@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vall@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

No V intercept

print(system.time(zinb_Vnone <- zinbFit(core, ncores = 3, K = 2, V=matrix(0, ncol=1, nrow=NROW(core)))))
##    user  system elapsed 
## 120.871  30.694  59.448
par(mfrow=c(1, 2))
plot(zinb_Vnone@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_Vnone@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

df <- data.frame(W1 = zinb_Vnone@W[,1], W2 = zinb_Vnone@W[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                          W1          W2 detection_rate   coverage
## W1              1.000000000 0.005426967     -0.4105128 -0.1328474
## W2              0.005426967 1.000000000      0.8736250  0.5865832
## detection_rate -0.410512821 0.873624999      1.0000000  0.5275845
## coverage       -0.132847434 0.586583172      0.5275845  1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vnone)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vnone))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vnone@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vnone@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vnone@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vnone@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

V intercept only in Mu

V <- cbind(rep(0, NROW(core)), rep(1, NROW(core)))

print(system.time(zinb_Vmu <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=2L, which_V_pi=1L)))
##    user  system elapsed 
## 117.983  30.208  62.073
par(mfrow=c(1, 2))
plot(zinb_Vmu@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_Vmu@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

df <- data.frame(W1 = zinb_Vmu@W[,1], W2 = zinb_Vmu@W[,2], gamma_mu = zinb_Vmu@gamma_mu[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2  gamma_mu detection_rate   coverage
## W1              1.00000000 -0.06020466 0.0485493     -0.2663277 -0.1083201
## W2             -0.06020466  1.00000000 0.5593379      0.8898885  0.3772103
## gamma_mu        0.04854930  0.55933792 1.0000000      0.5880261  0.9235989
## detection_rate -0.26632771  0.88988849 0.5880261      1.0000000  0.5275845
## coverage       -0.10832007  0.37721026 0.9235989      0.5275845  1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vmu)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vmu))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vmu@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vmu@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vmu@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vmu@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

V intercept only in Pi

print(system.time(zinb_Vpi <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=1L, which_V_pi=2L)))
##    user  system elapsed 
## 152.040  40.292  80.801
par(mfrow=c(1, 2))
plot(zinb_Vpi@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_Vpi@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

df <- data.frame(W1 = zinb_Vpi@W[,1], W2 = zinb_Vpi@W[,2], gamma_pi = zinb_Vpi@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1         W2    gamma_pi detection_rate
## W1              1.00000000  0.1305085  0.09593638    -0.09884142
## W2              0.13050848  1.0000000 -0.37656124     0.27518468
## gamma_pi        0.09593638 -0.3765612  1.00000000    -0.98667190
## detection_rate -0.09884142  0.2751847 -0.98667190     1.00000000
## coverage        0.15676103 -0.2568486 -0.47896136     0.52758451
##                  coverage
## W1              0.1567610
## W2             -0.2568486
## gamma_pi       -0.4789614
## detection_rate  0.5275845
## coverage        1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vpi)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])

plot(rowMeans(log1p(getMu(zinb_Vpi))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vpi@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vpi@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vpi@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vpi@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

Do we need more than two dimensions?

print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3)))
##    user  system elapsed 
## 121.030  32.532  61.484
pairs(zinb_3@W, col=cols[bio], pch=19)

pairs(zinb_3@W, col=cols2[cl], pch=19)

## write matrices to file to feed to ZIFA in python
write.csv(log1p(core), file="allen.csv")

Using all genes

coreAll <- assay(allen_core)
dim(coreAll)
## [1] 11545   285
print(system.time(zinb_all <- zinbFit(coreAll, ncores = 3, K = 2)))
##     user   system  elapsed 
## 1175.281  154.257  574.979
par(mfrow=c(1, 2))
plot(zinb_all@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_all@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")

Interestingly, when using all the genes, things don’t work nicely anymore. I wonder if the difference is the selection of the most variable genes, rather than the complexity of the data.

ZIFA (top 1000 most variable genes)

Full algorithm was used. We got very similar results using block algorithm.

import sys
# modify python path to your python packages directory
sys.path.insert(1, '/usr/local/lib/python2.7/site-packages/')

from pandas import read_csv
from ZIFA import ZIFA
import numpy as np

df = read_csv('allen.csv')
del df['Unnamed: 0']
lc = np.array(df)
Y = np.transpose(lc)
Z, model_params  = ZIFA.fitModel(Y, 2)

np.savetxt('allen_zifa.csv', Z, delimiter=',')
## Running zero-inflated factor analysis with N = 285, D = 1000, K = 2
## Param change below threshold 1.000e-02 after 6 iterations
par(mfrow=c(1, 2))
zifa_res <- read.csv("allen_zifa.csv", header=FALSE)

plot(zifa_res, col=cols[bio], pch=19, xlab="Z1", ylab="Z2")
plot(zifa_res, col=cols2[cl], pch=19, xlab="Z1", ylab="Z2")

wrapRzifa <- function(Y, block = F){
  # wrapper R function for ZIFA.
  # md5 hashing and temporary files are used not to re-run zifa 
  # if it has already be run on this computer.
  d = digest(Y, "md5")
  tmp = paste0(tempdir(), '/', d)
  write.csv(Y, paste0(tmp, '.csv'))
  
  if (!file.exists(paste0(tmp, '_zifa.csv'))){
    print('run ZIFA')
    bb = ifelse(block, '-b ', '')
    cmd = sprintf('python run_zifa.py %s%s.csv %s_zifa.csv', bb, tmp, tmp)
    system(cmd)
  }
  read.csv(sprintf("%s_zifa.csv", tmp), header=FALSE)
}

Y = log2(core + 1)
zifa_res = wrapRzifa(Y)
plot(zifa_res, col=cols[bio], pch=19, xlab="Z1", ylab="Z2")
plot(zifa_res, col=cols2[cl], pch=19, xlab="Z1", ylab="Z2")
df <- data.frame(Z1 = zifa_res$V1,
                 Z2 = zifa_res$V2,
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         Z1         Z2 detection_rate   coverage
## Z1              1.00000000 0.07493093     -0.3890703 -0.2343373
## Z2              0.07493093 1.00000000      0.6859741  0.4423235
## detection_rate -0.38907034 0.68597407      1.0000000  0.5275845
## coverage       -0.23433727 0.44232350      0.5275845  1.0000000
df <- data.frame(W1 = zinb_Vall@W[,1],
                 W2 = zinb_Vall@W[,2],
                 Z1 = zifa_res$V1,
                 Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'Intercept in both Mu and Pi')

print(cor(df, method="spearman"))
##             W1          W2          Z1          Z2
## W1  1.00000000  0.03514491  0.87737303 -0.06011083
## W2  0.03514491  1.00000000 -0.27471165 -0.77121359
## Z1  0.87737303 -0.27471165  1.00000000  0.07493093
## Z2 -0.06011083 -0.77121359  0.07493093  1.00000000
df <- data.frame(W1 = zinb_Vnone@W[,1],
                 W2 = zinb_Vnone@W[,2],
                 Z1 = zifa_res$V1,
                 Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'No intercept')

print(cor(df, method="spearman"))
##             W1           W2           Z1         Z2
## W1 1.000000000  0.005426967  0.940024468 0.11583976
## W2 0.005426967  1.000000000 -0.008162482 0.88366985
## Z1 0.940024468 -0.008162482  1.000000000 0.07493093
## Z2 0.115839757  0.883669851  0.074930925 1.00000000
df <- data.frame(W1 = zinb_Vmu@W[,1],
                 W2 = zinb_Vmu@W[,2],
                 Z1 = zifa_res$V1,
                 Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'Intercept in Mu')

print(cor(df, method="spearman"))
##             W1          W2          Z1         Z2
## W1  1.00000000 -0.06020466  0.93880730 0.27101971
## W2 -0.06020466  1.00000000 -0.21813102 0.85510842
## Z1  0.93880730 -0.21813102  1.00000000 0.07493093
## Z2  0.27101971  0.85510842  0.07493093 1.00000000
df <- data.frame(W1 = zinb_Vpi@W[,1],
                 W2 = zinb_Vpi@W[,2],
                 Z1 = zifa_res$V1,
                 Z2 = zifa_res$V2)
pairs(df, pch=19, col=cols[bio], main = 'Intercept in Pi')

print(cor(df, method="spearman"))
##           W1        W2         Z1         Z2
## W1 1.0000000 0.1305085 0.88529862 0.37835330
## W2 0.1305085 1.0000000 0.13259032 0.63760776
## Z1 0.8852986 0.1325903 1.00000000 0.07493093
## Z2 0.3783533 0.6376078 0.07493093 1.00000000

GC and length

library(biomaRt)
mart = useMart("ensembl")
mart = useDataset("hsapiens_gene_ensembl", mart = mart)
attrs = c("hgnc_symbol", "entrezgene")
bm = getBM(attributes=attrs, mart = mart)
coreGC = core
rownames(coreGC) = toupper(rownames(core))
bm = bm[match(rownames(coreGC), bm[,1]),]
bm = na.omit(bm)
gene_info = getGeneLengthAndGCContent(as.character(bm[,2]), "hg38", mode="org.db")
rownames(gene_info) = bm[,1]
gene_info = na.omit(gene_info)
coreGC = coreGC[rownames(gene_info),]
dim(core)
## [1] 1000  285
dim(coreGC)
## [1] 902 285
par(mfrow=c(1,2))
gene_detection_rate = rowSums(coreGC>0)/ncol(coreGC)
plot(gene_info[,'length'], gene_detection_rate, xlab = 'Gene Length',
     ylab = 'Gene detection rate')
plot(gene_info[,'gc'], gene_detection_rate, xlab = 'GC',
     ylab = 'Gene detection rate')

GC and length in both Vmu and Vpi

We keep intercept in both Vmu and Vpi and add GC and length to both Vmu and Vpi.

V = cbind(rep(1, NROW(coreGC)), gene_info)
print(system.time(zinb_gc <- zinbFit(coreGC, ncores = 3, K = 2, V = V)))
##    user  system elapsed 
## 129.046  12.763  60.878
plot(zinb_gc@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_gc@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

df <- data.frame(W1=zinb_gc@W[,1],
                 W2=zinb_gc@W[,2],
                 gamma_mu = zinb_gc@gamma_mu[1,],
                 gamma_pi = zinb_gc@gamma_pi[1,],
                 detection_rate=detection_rate,
                 coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2    gamma_mu    gamma_pi
## W1              1.00000000 -0.54605587 -0.09412048  0.31023239
## W2             -0.54605587  1.00000000  0.12451440 -0.10371060
## gamma_mu       -0.09412048  0.12451440  1.00000000 -0.03403039
## gamma_pi        0.31023239 -0.10371060 -0.03403039  1.00000000
## detection_rate -0.42328832  0.04964245  0.17586323 -0.63587802
## coverage       -0.06754498  0.03298118  0.58049215 -0.27384906
##                detection_rate    coverage
## W1                -0.42328832 -0.06754498
## W2                 0.04964245  0.03298118
## gamma_mu           0.17586323  0.58049215
## gamma_pi          -0.63587802 -0.27384906
## detection_rate     1.00000000  0.52758451
## coverage           0.52758451  1.00000000
df <- data.frame(alphaMu1=zinb_gc@alpha_mu[1,],
                 alphaMu2=zinb_gc@alpha_mu[2,],
                 alphaPi1=zinb_gc@alpha_pi[1,],
                 alphaPi2=zinb_gc@alpha_pi[2,],
                 gene_detection_rate=gene_detection_rate)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                        alphaMu1    alphaMu2    alphaPi1    alphaPi2
## alphaMu1             1.00000000 -0.09034811 -0.67360284  0.05224543
## alphaMu2            -0.09034811  1.00000000  0.03646762 -0.62179332
## alphaPi1            -0.67360284  0.03646762  1.00000000  0.01937837
## alphaPi2             0.05224543 -0.62179332  0.01937837  1.00000000
## gene_detection_rate -0.15790047 -0.03702463  0.17796001  0.06782407
##                     gene_detection_rate
## alphaMu1                    -0.15790047
## alphaMu2                    -0.03702463
## alphaPi1                     0.17796001
## alphaPi2                     0.06782407
## gene_detection_rate          1.00000000

GC and length in Vmu only

We keep intercept in both Vmu and Vpi and add GC and length only in Vmu.

V = cbind(rep(1, NROW(coreGC)), gene_info)
print(system.time(zinb_gc <- zinbFit(coreGC, ncores = 3, K = 2, V = V,
                                     which_V_mu=1:3, which_V_pi=1L)))
##    user  system elapsed 
## 112.998  11.981  52.600
plot(zinb_gc@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_gc@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

df <- data.frame(W1=zinb_gc@W[,1],
                 W2=zinb_gc@W[,2],
                 gamma_mu = zinb_gc@gamma_mu[1,],
                 gamma_pi = zinb_gc@gamma_pi[1,],
                 detection_rate=detection_rate,
                 coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                        W1          W2   gamma_mu    gamma_pi
## W1              1.0000000 -0.56477422 -0.1032456  0.43943455
## W2             -0.5647742  1.00000000  0.1316033 -0.05278191
## gamma_mu       -0.1032456  0.13160331  1.0000000 -0.13746261
## gamma_pi        0.4394345 -0.05278191 -0.1374626  1.00000000
## detection_rate -0.4389583  0.06428882  0.1800088 -0.99334074
## coverage       -0.0802143  0.02731886  0.5801925 -0.48232309
##                detection_rate    coverage
## W1                -0.43895825 -0.08021430
## W2                 0.06428882  0.02731886
## gamma_mu           0.18000881  0.58019253
## gamma_pi          -0.99334074 -0.48232309
## detection_rate     1.00000000  0.52758451
## coverage           0.52758451  1.00000000
df <- data.frame(alphaMu1=zinb_gc@alpha_mu[1,],
                 alphaMu2=zinb_gc@alpha_mu[2,],
                 alphaPi1=zinb_gc@alpha_pi[1,],
                 alphaPi2=zinb_gc@alpha_pi[2,],
                 gene_detection_rate=gene_detection_rate)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                        alphaMu1    alphaMu2    alphaPi1    alphaPi2
## alphaMu1             1.00000000 -0.08721723 -0.67039583  0.05977531
## alphaMu2            -0.08721723  1.00000000  0.03998078 -0.62303895
## alphaPi1            -0.67039583  0.03998078  1.00000000  0.01100490
## alphaPi2             0.05977531 -0.62303895  0.01100490  1.00000000
## gene_detection_rate -0.15733394 -0.04386984  0.18069937  0.05793565
##                     gene_detection_rate
## alphaMu1                    -0.15733394
## alphaMu2                    -0.04386984
## alphaPi1                     0.18069937
## alphaPi2                     0.05793565
## gene_detection_rate          1.00000000

GC and length in Vpi only

We keep intercept in both Vmu and Vpi and add GC and length only in Vpi.

V = cbind(rep(1, NROW(coreGC)), gene_info)
print(system.time(zinb_gc <- zinbFit(coreGC, ncores = 3, K = 2, V = V,
                                     which_V_mu=1L, which_V_pi=1:3)))
##    user  system elapsed 
## 147.383  18.599 106.261
plot(zinb_gc@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_gc@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

df <- data.frame(W1=zinb_gc@W[,1],
                 W2=zinb_gc@W[,2],
                 gamma_mu = zinb_gc@gamma_mu[1,],
                 gamma_pi = zinb_gc@gamma_pi[1,],
                 detection_rate=detection_rate,
                 coverage=coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                         W1          W2    gamma_mu   gamma_pi
## W1              1.00000000  0.24532650  0.08201828  0.1390043
## W2              0.24532650  1.00000000  0.36545537 -0.2722286
## gamma_mu        0.08201828  0.36545537  1.00000000 -0.1374051
## gamma_pi        0.13900429 -0.27222859 -0.13740507  1.0000000
## detection_rate -0.14661948  0.43639636  0.33830194 -0.6260981
## coverage       -0.04965709  0.06601056  0.82969617 -0.2656031
##                detection_rate    coverage
## W1                 -0.1466195 -0.04965709
## W2                  0.4363964  0.06601056
## gamma_mu            0.3383019  0.82969617
## gamma_pi           -0.6260981 -0.26560311
## detection_rate      1.0000000  0.52758451
## coverage            0.5275845  1.00000000
df <- data.frame(alphaMu1=zinb_gc@alpha_mu[1,],
                 alphaMu2=zinb_gc@alpha_mu[2,],
                 alphaPi1=zinb_gc@alpha_pi[1,],
                 alphaPi2=zinb_gc@alpha_pi[2,],
                 gene_detection_rate=gene_detection_rate)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                          alphaMu1    alphaMu2    alphaPi1    alphaPi2
## alphaMu1             1.0000000000 -0.04339095 -0.60120643  0.05916533
## alphaMu2            -0.0433909522  1.00000000  0.10041003 -0.60652687
## alphaPi1            -0.6012064296  0.10041003  1.00000000 -0.15392199
## alphaPi2             0.0591653284 -0.60652687 -0.15392199  1.00000000
## gene_detection_rate -0.0004593927  0.16423501  0.06517973 -0.17809078
##                     gene_detection_rate
## alphaMu1                  -0.0004593927
## alphaMu2                   0.1642350091
## alphaPi1                   0.0651797333
## alphaPi2                  -0.1780907788
## gene_detection_rate        1.0000000000